CUSTOMER SEGMENTATION - USING K-MEANS CLUSTERING IN PYTHON
Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
%matplotlib inline
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import plotly.io as pio
pio.renderers.default='notebook'
Importing Dataset
# Read the mall-customer CSV from its local path and preview the first ten rows.
csv_path = r"G:\2025 DS FOLDER\CS - Customer Segmentation\Kmean Clustering in Python\Mall_Customers.csv"
df = pd.read_csv(csv_path)
df.head(n=10)
| | CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
| 5 | 6 | Female | 22 | 17 | 76 |
| 6 | 7 | Female | 35 | 18 | 6 |
| 7 | 8 | Female | 23 | 18 | 94 |
| 8 | 9 | Male | 64 | 19 | 3 |
| 9 | 10 | Female | 30 | 19 | 72 |
# Dimensions of the loaded frame as (rows, columns).
df.shape
(200, 5)
# Summarise column names, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null object 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
There are no null values in the dataset. The data is clean and ready for use.
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
| | CustomerID | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
| std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
| min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
| 25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
| 50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
| 75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
| max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
# Shorten the two verbose column names.
# The rename is keyed by the existing names instead of overwriting df.columns
# positionally, so a different column order in the CSV can never attach a
# label to the wrong data. Keys that are absent are ignored, which keeps this
# cell safe to re-run.
df = df.rename(
    columns={
        'Annual Income (k$)': 'Annual Income',
        'Spending Score (1-100)': 'Spending Score',
    }
)
# Confirm the new column names.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null object 2 Age 200 non-null int64 3 Annual Income 200 non-null int64 4 Spending Score 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
The column names were changed successfully.
Performing EDA - Exploratory Data Analysis
# Side-by-side distributions (histogram with KDE overlay) of the three
# numeric features.
# Each entry is (column, bar colour, panel title, x-axis label).
histogram_specs = [
    ('Age', 'skyblue', 'Customer Age', 'Customer Age'),
    ('Annual Income', 'forestgreen', 'Customer Annual Income', 'Income'),
    ('Spending Score', 'royalblue', 'Customer Spending Score', 'Spending Score'),
]
plt.figure(figsize=(18, 6))
for position, (column, colour, title, xlabel) in enumerate(histogram_specs, start=1):
    plt.subplot(1, 3, position)
    sns.histplot(df[column], color=colour, kde=True)
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    # Label the y-axis on every panel so all three read consistently.
    plt.ylabel('Frequency', fontsize=14)
plt.show()
# Bar chart of the number of customers per gender.
# The category order is pinned explicitly and the tick labels are taken from
# the data; relabelling tick positions by hand would silently mislabel the
# bars if the categories were ever drawn in a different order.
gender_order = ['Male', 'Female']
plt.figure(figsize=(8, 6))
sns.countplot(
    x='Gender',
    hue='Gender',
    data=df,
    order=gender_order,
    hue_order=gender_order,
    palette='winter',
    legend=False,
)
plt.title('Count Plot of Gender', fontsize=16)
plt.xlabel('Gender', fontsize=14)
plt.ylabel('Count', fontsize=14)
# Only restyle the existing tick labels; do not replace their text.
plt.xticks(fontsize=12)
plt.show()
# Pairwise regression plots of the numeric features; the ID and gender
# columns are excluded.
sns.set_style('white')
numeric_features = df.drop(columns=['CustomerID', 'Gender'])
new = sns.PairGrid(numeric_features)
new.map(sns.regplot, color='teal')
plt.suptitle(
    'Relationship between Age, Annual Income and Spending Score',
    y=1.05,
    fontsize=14,
)
plt.show()
# Scatter of age against annual income, coloured by gender (regression line
# disabled).
sns.lmplot(
    data=df,
    x='Age',
    y='Annual Income',
    hue='Gender',
    palette='plasma',
    height=6,
    fit_reg=False,
)
plt.title('Relationship between Age, Annual Income and Gender', fontsize=14)
plt.xlabel('Age', fontsize=13)
plt.ylabel('Annual Income', fontsize=13)
plt.show()
# Scatter of annual income against spending score, coloured by gender
# (regression line disabled).
sns.lmplot(
    data=df,
    x='Annual Income',
    y='Spending Score',
    hue='Gender',
    palette='Dark2',
    height=6,
    fit_reg=False,
)
plt.title('Annual Income vs Spending Score vs Gender', fontsize=14)
plt.xlabel('Annual Income', fontsize=13)
plt.ylabel('Spending Score', fontsize=13)
plt.show()
K-Means Clustering: Segmentation Based on Age and Spending Score. Finding the Optimal Number of Clusters: first, we find the optimal number of clusters using the elbow method.
# Restrict OpenMP to a single thread; the original author's stated purpose is
# to avoid a memory leak.
# NOTE(review): sklearn is already imported earlier in this file. Variables
# such as OMP_NUM_THREADS are usually read when the native runtime is loaded,
# so this assignment may need to move above the imports to take effect —
# TODO confirm.
os.environ['OMP_NUM_THREADS'] = '1'
# Ignore UserWarning messages originating from sklearn's k-means module.
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.cluster._kmeans')
# Elbow method on (Age, Spending Score): fit k-means for 1 to 10 clusters and
# record the inertia of every fit.
X1 = df[['Age', 'Spending Score']]
kmeans_options = dict(
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='lloyd',
)
inertia1 = []
for n in range(1, 11):
    model1 = KMeans(n_clusters=n, **kmeans_options).fit(X1)
    inertia1.append(model1.inertia_)
print(inertia1)
[171535.5, 75949.15601023019, 45840.67661610867, 28165.58356662934, 23818.84825158133, 19510.328802776174, 15514.193134351035, 13012.800984809695, 11513.134979462546, 10590.814352780133]
# Elbow curve: inertia for each candidate number of clusters (1 to 10).
cluster_counts = list(range(1, 11))
marker_style = dict(marker='o', markersize=12, markerfacecolor='m', markeredgecolor='m')
plt.figure(figsize=(8, 6))
plt.plot(cluster_counts, inertia1, color='royalblue', linewidth=2, **marker_style)
plt.title('Inertia vs. Number of Clusters', fontsize=18)
plt.xlabel('Number of Clusters', fontsize=15)
plt.ylabel('Inertia', fontsize=15)
plt.show()
We can see that the inertia drops steeply while the number of clusters is below 4 and decreases much more slowly beyond 4. This "elbow" at 4 is why we choose 4 as the optimal number of clusters.
# Final k-means fit on (Age, Spending Score) with the 4 clusters chosen from
# the elbow curve.
model1 = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='lloyd',
).fit(X1)
# Per-row cluster assignment and the cluster centre coordinates.
labels1, centroids1 = model1.labels_, model1.cluster_centers_
# Keep UserWarning messages from sklearn's k-means module suppressed.
warnings.filterwarnings(action="ignore", category=UserWarning, module='sklearn.cluster._kmeans')
Visualizing the Clusters / Segmentation
# Customers coloured by their cluster, with the cluster centres overlaid as
# large red markers.
plt.figure(figsize=(11, 6))
sns.set_style('white')
plt.scatter(x=df['Age'], y=df['Spending Score'], c=labels1, cmap='winter')
centre_age = centroids1[:, 0]
centre_score = centroids1[:, 1]
plt.scatter(x=centre_age, y=centre_score, s=300, c='red')
plt.xlabel('Age', fontsize=15)
plt.ylabel('Spending Score', fontsize=15)
plt.title('Customer Segmentation Based on Age and Spending Score', fontsize=18)
plt.show()
Segmentation Based on Annual Income and Spending Score: Finding the Optimal Number of Clusters
# Elbow method on (Annual Income, Spending Score): fit k-means for 1 to 10
# clusters and record the inertia of every fit.
X2 = df[['Annual Income', 'Spending Score']]
kmeans_options = dict(
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='lloyd',
)
inertia2 = []
for n in range(1, 11):
    model2 = KMeans(n_clusters=n, **kmeans_options).fit(X2)
    inertia2.append(model2.inertia_)
print(inertia2)
# Keep UserWarning messages from sklearn's k-means module suppressed.
warnings.filterwarnings(action="ignore", category=UserWarning, module='sklearn.cluster._kmeans')
[269981.28, 181363.59595959596, 106348.37306211119, 73679.78903948834, 44448.45544793371, 37265.86520484346, 30241.343617936585, 25022.485004530354, 21841.97825674864, 19707.258979794773]
Next, let's plot the inertia to locate the optimal number of clusters.
# Elbow curve: inertia for each candidate number of clusters (1 to 10).
cluster_counts = list(range(1, 11))
marker_style = dict(marker='o', markersize=12, markerfacecolor='c', markeredgecolor='c')
plt.figure(figsize=(8, 6))
sns.set_style('white')
plt.plot(cluster_counts, inertia2, color='forestgreen', linewidth=2, **marker_style)
plt.title('Inertia vs. Number of Clusters', fontsize=18)
plt.xlabel('Number of Clusters', fontsize=15)
plt.ylabel('Inertia', fontsize=15)
plt.show()
We can see that the inertia drops steeply while the number of clusters is below 5 and decreases much more slowly beyond 5. This "elbow" at 5 is why we choose 5 as the optimal number of clusters.
Performing Clustering with the Optimal Number of Clusters
# Final k-means fit on (Annual Income, Spending Score) with the 5 clusters
# chosen from the elbow curve.
model2 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='lloyd',
).fit(X2)
# Per-row cluster assignment and the cluster centre coordinates.
labels2, centroids2 = model2.labels_, model2.cluster_centers_
# Keep UserWarning messages from sklearn's k-means module suppressed.
warnings.filterwarnings(action="ignore", category=UserWarning, module='sklearn.cluster._kmeans')
Visualizing the Clusters
# Customers coloured by their cluster, with the cluster centres overlaid as
# large red markers.
plt.figure(figsize=(11, 6))
sns.set_style('white')
plt.scatter(x=df['Annual Income'], y=df['Spending Score'], c=labels2, cmap='winter')
centre_income = centroids2[:, 0]
centre_score = centroids2[:, 1]
plt.scatter(centre_income, centre_score, c='red', s=200)
plt.xlabel('Annual Income', fontsize=15)
plt.ylabel('Spending Score', fontsize=15)
plt.title('Customer Segmentation Based on Annual Income and Spending Score', fontsize=18)
plt.show()
Performing Clustering Based on Age, Annual Income and Spending Score
Determining the Optimal Number of Clusters Based on the Features Above
# Elbow method on (Age, Annual Income, Spending Score): fit k-means for 1 to 9
# clusters and record the inertia of every fit.
X3 = df[['Age', 'Annual Income', 'Spending Score']]
kmeans_options = dict(
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='lloyd',
)
inertia3 = []
for n in range(1, 10):
    model3 = KMeans(n_clusters=n, **kmeans_options).fit(X3)
    inertia3.append(model3.inertia_)
print(inertia3)
# Keep UserWarning messages from sklearn's k-means module suppressed.
warnings.filterwarnings(action="ignore", category=UserWarning, module='sklearn.cluster._kmeans')
[308812.78, 212840.1698209719, 143342.751571706, 104366.15145556198, 75479.76429338778, 58300.44332159069, 51525.773770373766, 44307.87341670445, 40932.6282276547]
Plotting the Inertia against the Number of Clusters to Identify the Optimal Number of Clusters
# Elbow curve: inertia for each candidate number of clusters (1 to 9).
cluster_counts = list(range(1, 10))
marker_style = dict(marker='o', markersize=12, markerfacecolor='violet', markeredgecolor='violet')
plt.figure(figsize=(8, 6))
plt.plot(cluster_counts, inertia3, color='royalblue', linewidth=2, **marker_style)
plt.title('Inertia vs. Number of Clusters', fontsize=18)
plt.xlabel('Number of Clusters', fontsize=15)
plt.ylabel('Inertia', fontsize=15)
plt.show()
We can see that the inertia drops steeply while the number of clusters is below 6 and decreases much more slowly beyond 6. This "elbow" at 6 is why we choose 6 as the optimal number of clusters.
# Final k-means fit on (Age, Annual Income, Spending Score) with the 6
# clusters chosen from the elbow curve.
model3 = KMeans(
    n_clusters=6,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='lloyd',
)
model3.fit(X3)
# The inertia of this fit is deliberately NOT appended to inertia3: that list
# holds one value per candidate cluster count from the elbow search, and an
# extra entry would corrupt it and break the elbow plot on re-run.
# Per-row cluster assignment and the cluster centre coordinates.
labels3 = model3.labels_
centroids3 = model3.cluster_centers_
# Keep UserWarning messages from sklearn's k-means module suppressed.
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.cluster._kmeans')
# Interactive 3-D scatter of the three clustering features, coloured by
# k-means cluster.
# Cluster ids are nominal labels, so they are converted to strings: plotly
# then assigns one discrete colour per cluster and shows a legend, instead of
# a continuous colour bar that would wrongly suggest an ordering of clusters.
cluster_ids = labels3.astype(str)
fig = px.scatter_3d(
    data_frame=df,
    x='Age',
    y='Annual Income',
    z='Spending Score',
    color=cluster_ids,
    labels={'color': 'Cluster'},
    # List the clusters in ascending order in the legend.
    category_orders={'color': sorted(set(cluster_ids))},
)
# Centre the title horizontally near the top of the figure.
fig.update_layout(
    title={
        'text': 'Customer Segmentation Based on Age, Annual Income and Spending Score',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
)
fig.show()